TeX 1995 July

home *** CD-ROM | disk | FTP | other *** search

/ TeX 1995 July / TeX CD-ROM July 1995 (Disc 1)(Walnut Creek)(1995).ISO / biblio / bibtex / utils / bibclean / match.c < prev next >

Wrap

C/C++ Source or Header | 1992-11-21 | 10KB | 394 lines

/***********************************************************************

			  ========== BACKGROUND ==========

This file contains an implementation of limited regular-expression
pattern matching code.  The pattern syntax is simpler, more limited,
and different from normal regular-expression pattern matching syntax.
It is described in more detail below.

The motivation for this new code is that I found considerable
inconsistency in the matching behavior between versions of either
re_comp()/re_exec() or compile()/step() on these systems

	DECstation 3100
	IBM 3090
	IBM PS/2
	IBM RS/6000 AIX 3.2
	NeXT Mach 3.0
	Silicon Graphics IRIX 4.0
	Stardent OS 2.2
	Sun SPARC

That makes use of those regular-expression pattern matchers unreliable
across systems.

One possible solution would be to use the GNU re_comp() and re_exec()
from the regexp distribution on prep.ai.mit.edu (as of writing,
pub/gnu/regex-0.11.*).  However, that code is large (5000+ lines), and
its installation uses configuration facilities that only work under
some variants of UNIX, and are completely useless on other operating
systems.

By contrast, the pattern matching code here is quite adequate for
bibclean's needs, and can be expressed in fewer than 140 lines.  In
addition, it provides special handling of TeX control sequences and
braces that would be rather awkward to express in conventional
regular-expression syntax.

If the symbol TEST is defined at compile time, a main program will be
included that can be used for testing patterns supplied from stdin.

			 ============== PATTERN SYNTAX ==============

The string values to be pattern-matched are tab-free single-line
values delimited by quotation marks.

The patterns are represented by the following special markers:

	a	exactly one letter
	A	one or more letters
	d	exactly one digit
	D	one or more digits
	w	exactly one word (one or more letters and digits)
	W	one or more space-separated words, beginning and ending
		with a word
	X	one or more special-separated words, beginning and
		ending with a word
	.	one special character (see SPECIAL_CHARS defined below)
	:	one or more special characters
	<space>	one or more spaces
	\x	exactly one x (x is any character)
	x	exactly the character x (x is anything but
		aAdDwWX.:<space> and backslash)

Special characters are a subset of punctuation characters that are
typically used in values.

Note the \<space> represents a single literal space, \\ a single
literal backslash, \a the letter a, \A the letter A, \d the letter d,
\D the letter D, and so on.

Remember to double all backslashes in C strings: \a must be entered as
\\a, and "and" as "\\an\\d".

Each pattern is matched against the entire string and must match
successfully for a YES return from match_pattern().  Consequently,
there is no need for an analogue of ^ and $ in full regular
expressions.  Neither is there provision for matching on arbitrary
sets of characters.  Instead, fixed sets of characters are provided
(conventional regular-expression equivalents are shown in
parentheses):

	digits ([0-9]),
	alphanumerics ([A-Za-z0-9]),
	space ([ \t\f\r\n\v]), and
	special ([][ !#()*+,-./:;?~])

In addition, TeX control sequences of the form
\<one-special-character> or \<letter-sequence> in the string are
ignored in the match, together with any following whitespace.

Braces are also ignored, but not whitespace following them.

Thus "{TR\slash A87}" matches the patterns "AD" and "W", and
"{TR A\slash 87}" matches the patterns "A AD" and "A W".

[11-Nov-1992]
***********************************************************************/

#include "os.h"
#include "xstdlib.h"
#include "xstring.h"
#include "xctype.h"

RCSID("$Id: match.c,v 1.2 1992/11/22 17:44:32 beebe Exp beebe $")

/* $Log: match.c,v $
 * Revision 1.2  1992/11/22  17:44:32  beebe
 * Change type of match_patterns from int to YESorNO for version 2.05 bibclean.
 *
 * Revision 1.1  1992/11/15  08:20:05  beebe
 * Initial revision
 *
 */

#define NEW_STYLE    (__cplusplus || __STDC__ || c_plusplus)

#if NEW_STYLE
#define VOID    void
#else /* K&R style */
#define VOID
#endif /* NEW_STYLE */

#if NEW_STYLE
typedef enum { NO = 0, YES = 1 } YESorNO;
#else /* K&R style */
#define NO  0                   /* must be FALSE (zero) */
#define YES 1                   /* must be TRUE (non-zero) */
typedef int YESorNO;
#endif /* NEW_STYLE */

#include "match.h"

#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0
#endif

#define SPECIAL_CHARS   " !#()*+,-./:;?[]~"

/*
 * strchr() also finds the string terminator, so '\0' must be excluded
 * explicitly; otherwise the end of the value would look like a special
 * character and the ':' and 'X' loops would scan past it.
 * NB: the argument is evaluated twice; pass only side-effect-free
 * expressions.
 */
#define isspecial(c)    (((c) != '\0') && \
                         (strchr(SPECIAL_CHARS,(c)) != (char*)NULL))

/*
 * The <ctype.h> classifiers are only defined for EOF and values
 * representable as unsigned char, so plain (possibly signed) chars are
 * converted before classification.
 */
#define IS_ALPHA(c)     isalpha((unsigned char)(c))
#define IS_ALNUM(c)     isalnum((unsigned char)(c))
#define IS_DIGIT(c)     isdigit((unsigned char)(c))
#define IS_SPACE(c)     isspace((unsigned char)(c))

static const char *skip_ignored ARGS((const char *s_));
static const char *next_s ARGS((const char *s_));


/*
 * match_pattern() -- match the ENTIRE string s against pattern, using
 * the marker syntax described at the top of this file.
 *
 * TeX control sequences (plus whitespace following them) and braces in
 * s are invisible to the match.
 *
 * Returns YES if pattern consumes all of s, otherwise NO.  A pattern
 * that ends in a lone backslash is malformed and never matches.
 */
#if NEW_STYLE
YESorNO
match_pattern(const char *s, const char *pattern)
#else /* K&R style */
YESorNO
match_pattern(s,pattern)
const char *s;
const char *pattern;
#endif /* NEW_STYLE */
{
    /* position on the first significant character without ever forming
       a pointer in front of the caller's buffer */
    s = skip_ignored(s);

    for ( ; *pattern; ++pattern)
    {
        switch (*pattern)
        {
        case 'a':                       /* single letter */
            if (!IS_ALPHA(*s))
                return (NO);
            s = next_s(s);
            break;

        case 'A':                       /* one or more letters */
            if (!IS_ALPHA(*s))
                return (NO);
            while (IS_ALPHA(*s))
                s = next_s(s);
            break;

        case 'd':                       /* single digit */
            if (!IS_DIGIT(*s))
                return (NO);
            s = next_s(s);
            break;

        case 'D':                       /* one or more digits */
            if (!IS_DIGIT(*s))
                return (NO);
            while (IS_DIGIT(*s))
                s = next_s(s);
            break;

        case 'w':                       /* one word (letters and digits) */
            if (!IS_ALNUM(*s))
                return (NO);
            while (IS_ALNUM(*s))
                s = next_s(s);
            break;

        case 'W':                       /* one or more space-separated words */
            if (!IS_ALNUM(*s))
                return (NO);
            while (IS_ALNUM(*s))        /* parse first word */
                s = next_s(s);
            while (IS_SPACE(*s))
            {
                while (IS_SPACE(*s))    /* parse separators */
                    s = next_s(s);
                while (IS_ALNUM(*s))    /* parse another word */
                    s = next_s(s);
            }
            break;

        case 'X':                       /* one or more special-separated words */
            if (!IS_ALNUM(*s))
                return (NO);
            while (IS_ALNUM(*s))        /* parse first word */
                s = next_s(s);
            while (isspecial(*s))
            {
                while (isspecial(*s))   /* parse separators */
                    s = next_s(s);
                while (IS_ALNUM(*s))    /* parse another word */
                    s = next_s(s);
            }
            break;

        case ' ':                       /* one or more whitespace characters */
            if (!IS_SPACE(*s))
                return (NO);
            while (IS_SPACE(*s))
                s = next_s(s);
            break;

        case '.':                       /* exactly one special character */
            if (!isspecial(*s))
                return (NO);
            s = next_s(s);              /* consume it, like 'a' and 'd' do */
            break;

        case ':':                       /* one or more special characters */
            if (!isspecial(*s))
                return (NO);
            while (isspecial(*s))
                s = next_s(s);
            break;

        case '\\':                      /* literal next character */
            pattern++;
            if (*pattern == '\0')       /* dangling backslash: reject, so */
                return (NO);            /* that neither pointer overruns */
            /* fall through to exact match test */

        default:                        /* anything else: exact match */
            if (*pattern != *s)
                return (NO);
            s = next_s(s);
            break;
        }                               /* end switch */
    }                                   /* end for (; ;) */

    return ((*s == '\0') ? YES : NO);   /* YES if reached end of string */
}


/*
 * skip_ignored() -- starting AT s, step over everything the matcher
 * must not see: braces, and TeX control sequences together with the
 * whitespace that follows them.  Returns a pointer to the first
 * significant character, or to the terminating NUL.
 */
#if NEW_STYLE
static const char *
skip_ignored(const char *s)
#else /* K&R style */
static const char *
skip_ignored(s)
const char *s;
#endif /* NEW_STYLE */
{
    for (;;)
    {
        switch (*s)
        {
        case '\\':                      /* TeX control sequence */
            ++s;                        /* look at next character */
            if (IS_ALPHA(*s))           /* \<one-or-more-letters> */
            {
                while (IS_ALPHA(*s))
                    ++s;
            }
            else if (*s != '\0')        /* \<non-letter>; a backslash at */
                ++s;                    /* the very end has nothing to skip */
            while (IS_SPACE(*s))        /* advance over trailing whitespace */
                ++s;                    /* since TeX does too */
            break;

        case '{':
        case '}':
            ++s;
            break;

        default:
            return (s);
        }                               /* end switch */
    }                                   /* end for */
}


/*
 * next_s() -- return the position of the next significant character
 * AFTER s, ignoring braces and ignoring TeX control sequences and any
 * space that follows them.  At the end of the string, s is returned
 * unchanged so the result never points beyond the terminating NUL.
 */
#if NEW_STYLE
static const char *
next_s(const char *s)
#else /* K&R style */
static const char *
next_s(s)
const char *s;
#endif /* NEW_STYLE */
{
    if (*s == '\0')
        return (s);
    return (skip_ignored(s + 1));
}


#ifdef TEST

#define MAXLINE 256
#define NO_WARNING (const char *)NULL

MATCH_PATTERN year_patterns[] =
{
    {"\"DDDD\"",                NO_WARNING},
    {"\"DDDD,WDDDD\"",          NO_WARNING},
    {"\"DDDD, DDDD, DDDD\"",    NO_WARNING},
    {(const char*)NULL,         NO_WARNING},
};

MATCH_PATTERN number_patterns[] =
{
    {"\"D\"",                   "23"},
    {"\"A AD\"",                "PN LPS5001"},
    {"\"A D(D)\"",              "RJ 34(49)"},
    {"\"A D\"",                 "XNSS 288811"},
    {"\"A D\\.D\"",             "Version 3.20"},
    {"\"A-A-D-D\"",             "UMIAC-TR-89-11"},
    {"\"A-A-D\"",               "CS-TR-2189"},
    {"\"A-A-D\\.D\"",           "CS-TR-21.7"},
    {"\"A-AD-D\"",              "TN-K\\slash 27-70"},
    {"\"A-D D\"",               "PB-251 845"},
    {"\"A-D-D\"",               "ANL-30-74"},
    {"\"A-D\"",                 "TR-2189"},
    {"\"AD-D-D\"",              "GG24-3611-00"},
    {"\"AD-D\"",                "SP43-29"},
    {"\"AD\"",                  "LPS0064"},
    {"\"A\\#D-D\"",             "TR\\#89-24 ????"},
    {"\"D \\an\\d D\"",         "11 and 12"},
    {"\"D+D\"",                 "3+4"},
    {"\"D-D\"",                 "23-27"},
    {"\"D/D\"",                 "23/27"},
    {"\"DA\"",                  "23A"},
    {"\"D\\.D\"",               "3.4"},
    {"\"W-W W\"",               "AERE-R 12329"},
    {"\"W-W-WW-W\"",            "OSU-CISRC-4\\slash 87-TR9"},
    {"\"W\"",                   "Computer Science Report 100"},
    {"\"X\"",                   "TR/AB/3-43.7-3/AB"},
    {(const char*)NULL,         NO_WARNING},
};

int     main ARGS((int argc,char* argv[]));
static void process ARGS((const char *line_, MATCH_PATTERN patterns_[]));


/*
 * Test driver: read values from stdin, one per line, strip the
 * trailing newline, and check each against number_patterns[].
 */
#if NEW_STYLE
int
main(int argc, char* argv[])
#else /* K&R style */
int
main(argc,argv)
int argc;
char* argv[];
#endif /* NEW_STYLE */
{
    char line[MAXLINE];

    while (fgets(line,MAXLINE,stdin) != (char*)NULL)
    {
        char *p = strchr(line,'\n');

        if (p != (char *)NULL)
            *p = '\0';
        process(line,number_patterns);
    }

    exit (EXIT_SUCCESS);
    return (EXIT_SUCCESS);
}


/*
 * process() -- try each entry of patterns[] (terminated by a NULL
 * pattern) against line, in order.  For the first pattern that matches,
 * print its message unless the message is NO_WARNING, then return.
 * If nothing matches, report the line as an illegal value.
 */
#if NEW_STYLE
static void
process(const char *line, MATCH_PATTERN patterns[])
#else /* K&R style */
static void
process(line,patterns)
const char *line;
MATCH_PATTERN patterns[];
#endif /* NEW_STYLE */
{
    int k;

    for (k = 0; patterns[k].pattern != (const char*)NULL; ++k)
    {
        if (match_pattern(line,patterns[k].pattern) == YES)
        {
            if (patterns[k].message != NO_WARNING)
                printf("%%%% [%-24s]: %s\n", line, patterns[k].message);
            return;
        }
    }
    printf("?? [%-24s]: Illegal value\n", line);
}
#endif /* TEST */